Explore continued

librarian::shelf(
  DT, dplyr, dismo, GGally, here, readr, tidyr)
## 
##   The 'cran_repo' argument in shelf() was not set, so it will use
##   cran_repo = 'https://cran.r-project.org' by default.
## 
##   To avoid this message, set the 'cran_repo' argument to a CRAN
##   mirror URL (see https://cran.r-project.org/mirrors.html) or set
##   'quiet = TRUE'.
select <- dplyr::select # overwrite raster::select
options(readr.show_col_types = F)

dir_data    <- here("data/sdm")
pts_env_csv <- file.path(dir_data, "pts_env.csv")

pts_env <- read_csv(pts_env_csv)
nrow(pts_env)
## [1] 20000
datatable(pts_env, rownames = F)

Creating pairs plots using GGally to show correlations between variables. Halina notes: consider dropping ER_topoWet and ER_tri

GGally::ggpairs(
  select(pts_env, -ID),
  aes(color = factor(present)))
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 16 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 16 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 16 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 59 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 18 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero
## Warning: Removed 16 rows containing missing values (geom_point).
## Warning: Removed 16 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 16 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 16 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 64 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 16 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 16 rows containing missing values
## Warning: Removed 16 rows containing missing values (geom_point).

## Warning: Removed 16 rows containing missing values (geom_point).
## Warning: Removed 16 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 16 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 64 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 16 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 16 rows containing missing values
## Warning: Removed 16 rows containing missing values (geom_point).

## Warning: Removed 16 rows containing missing values (geom_point).

## Warning: Removed 16 rows containing missing values (geom_point).
## Warning: Removed 16 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 64 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 16 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 16 rows containing missing values
## Warning: Removed 59 rows containing missing values (geom_point).
## Warning: Removed 64 rows containing missing values (geom_point).

## Warning: Removed 64 rows containing missing values (geom_point).

## Warning: Removed 64 rows containing missing values (geom_point).
## Warning: Removed 59 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 64 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 59 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 59 rows containing missing values
## Warning: Removed 18 rows containing missing values (geom_point).
## Warning: Removed 25 rows containing missing values (geom_point).

## Warning: Removed 25 rows containing missing values (geom_point).

## Warning: Removed 25 rows containing missing values (geom_point).
## Warning: Removed 64 rows containing missing values (geom_point).
## Warning: Removed 18 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 18 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 18 rows containing missing values
## Warning: Removed 16 rows containing missing values (geom_point).

## Warning: Removed 16 rows containing missing values (geom_point).

## Warning: Removed 16 rows containing missing values (geom_point).
## Warning: Removed 59 rows containing missing values (geom_point).
## Warning: Removed 18 rows containing missing values (geom_point).
## Warning: Removed 16 rows containing missing values (geom_point).

## Warning: Removed 16 rows containing missing values (geom_point).

## Warning: Removed 16 rows containing missing values (geom_point).
## Warning: Removed 59 rows containing missing values (geom_point).
## Warning: Removed 18 rows containing missing values (geom_point).

2.1 Setup Data

# setup model data
d <- pts_env %>% 
  select(-ID) %>%  # remove terms we don't want to model
  tidyr::drop_na() # drop the rows with NA values
nrow(d)
## [1] 19931

2.2 Linear Model

# fit a linear model
mdl <- lm(present ~ ., data = d)
summary(mdl)
## 
## Call:
## lm(formula = present ~ ., data = d)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9845 -0.4210  0.1378  0.4142  1.2550 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.614e-01  7.994e-02   9.526  < 2e-16 ***
## WC_alt      -3.899e-05  1.167e-05  -3.340  0.00084 ***
## WC_bio1      3.529e-02  2.022e-03  17.455  < 2e-16 ***
## WC_bio2     -1.918e-02  1.955e-03  -9.810  < 2e-16 ***
## ER_tri      -3.639e-03  1.886e-04 -19.293  < 2e-16 ***
## ER_topoWet  -8.796e-02  3.889e-03 -22.616  < 2e-16 ***
## lon         -1.216e-03  2.850e-04  -4.267 1.99e-05 ***
## lat          1.528e-02  1.463e-03  10.446  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4672 on 19923 degrees of freedom
## Multiple R-squared:  0.1273, Adjusted R-squared:  0.1269 
## F-statistic:   415 on 7 and 19923 DF,  p-value: < 2.2e-16

Halina notes: predict() is creating a RasterLayer with a prediction based on a model, not 100% sure what the type argument is doing but I know we want the y-predict to be our response or y-values. But basically this is the part where we are running the logit regression so that our range is constrained between 0 and 1. We don’t want predictions outside of this range.

y_predict <- predict(mdl, d, type="response")
y_true    <- d$present

range(y_predict)
## [1] -0.2549701  0.9845352
range(y_true)
## [1] 0 1

The problem with these predictions is that it ranges outside the possible values of present 1 and absent 0. (Later we’ll deal with converting values within this range to either 1 or 0 by applying a cutoff value; i.e. any values > 0.5 become 1 and below become 0.)

Halina note: also my adjusted r squared is pretty small?

2.3 Generalized Linear Model

# fit a generalized linear model with a binomial logit link function
mdl <- glm(present ~ ., family = binomial(link="logit"), data = d)
summary(mdl)
## 
## Call:
## glm(formula = present ~ ., family = binomial(link = "logit"), 
##     data = d)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1521  -1.0310   0.4546   1.0167   2.6916  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.221e+00  3.794e-01   3.217 0.001296 ** 
## WC_alt      -1.636e-04  5.533e-05  -2.957 0.003110 ** 
## WC_bio1      1.636e-01  9.733e-03  16.810  < 2e-16 ***
## WC_bio2     -8.643e-02  9.373e-03  -9.221  < 2e-16 ***
## ER_tri      -1.793e-02  9.664e-04 -18.558  < 2e-16 ***
## ER_topoWet  -4.147e-01  1.881e-02 -22.046  < 2e-16 ***
## lon         -5.046e-03  1.310e-03  -3.853 0.000117 ***
## lat          7.361e-02  7.011e-03  10.499  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 27630  on 19930  degrees of freedom
## Residual deviance: 24926  on 19923  degrees of freedom
## AIC: 24942
## 
## Number of Fisher Scoring iterations: 4
y_predict <- predict(mdl, d, type="response")

range(y_predict)
## [1] 0.02648005 0.90191499

Excellent, our response is now constrained between 0 and 1. Next, let’s look at the term plots to see the relationship between predictor and response.

# show term plots
termplot(mdl, partial.resid = TRUE, se = TRUE, main = F)

2.4 Generalized Additive Model

librarian::shelf(mgcv)
## 
##   The 'cran_repo' argument in shelf() was not set, so it will use
##   cran_repo = 'https://cran.r-project.org' by default.
## 
##   To avoid this message, set the 'cran_repo' argument to a CRAN
##   mirror URL (see https://cran.r-project.org/mirrors.html) or set
##   'quiet = TRUE'.
# fit a generalized additive model with smooth predictors
mdl <- mgcv::gam(
  formula = present ~ s(WC_alt) + s(WC_bio1) + 
    s(WC_bio2) + s(ER_tri) + s(ER_topoWet) + s(lon) + s(lat), 
  family = binomial, data = d)
summary(mdl)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## present ~ s(WC_alt) + s(WC_bio1) + s(WC_bio2) + s(ER_tri) + s(ER_topoWet) + 
##     s(lon) + s(lat)
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept) -0.07374    0.02897  -2.546   0.0109 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                 edf Ref.df  Chi.sq p-value    
## s(WC_alt)     8.826  8.988  322.85  <2e-16 ***
## s(WC_bio1)    6.795  7.595  445.75  <2e-16 ***
## s(WC_bio2)    8.253  8.640  818.13  <2e-16 ***
## s(ER_tri)     8.690  8.921   73.91  <2e-16 ***
## s(ER_topoWet) 8.542  8.920   61.45  <2e-16 ***
## s(lon)        8.769  8.982 1025.23  <2e-16 ***
## s(lat)        8.901  8.994  361.79  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.405   Deviance explained = 34.9%
## UBRE = -0.092119  Scale est. = 1         n = 19931

Halina note: well my adjusted r squared definitely increased!

# show term plots
plot(mdl, scale=0)

2.5 Maxtent (maximum entropy)

# load extra packages
librarian::shelf(
  maptools, sf)
## 
##   The 'cran_repo' argument in shelf() was not set, so it will use
##   cran_repo = 'https://cran.r-project.org' by default.
## 
##   To avoid this message, set the 'cran_repo' argument to a CRAN
##   mirror URL (see https://cran.r-project.org/mirrors.html) or set
##   'quiet = TRUE'.
# show version of maxent
maxent()
## Loading required namespace: rJava
## This is MaxEnt version 3.4.3
# get environmental rasters
# NOTE: the first part of Lab 1. SDM - Explore got updated to write this clipped environmental raster stack
env_stack_grd <- file.path(dir_data, "env_stack.grd")
env_stack <- stack(env_stack_grd)
plot(env_stack, nc=2)

# get presence-only observation points (maxent extracts raster values for you)
obs_geo <- file.path(dir_data, "obs.geojson")
obs_sp <- read_sf(obs_geo) %>% 
  sf::as_Spatial() # maxent prefers sp::SpatialPoints over newer sf::sf class

# fit a maximum entropy model
mdl <- maxent(env_stack, obs_sp)
## Warning in .local(x, p, ...): 22 (0.55%) of the presence points have NA
## predictor values
## This is MaxEnt version 3.4.3

Halina notes: I’m concerned about this warning - I thought I got rid of the NA’s?

# plot variable contributions per predictor
plot(mdl)

Halina Notes: interesting that ER_tri and ER_topoWet contribute the least to each predictor because they are also variables I would consider removing since their correlation number was greater than 0.7.

# plot term plots
response(mdl)

# predict
y_predict <- predict(env_stack, mdl) #, ext=ext, progress='')

plot(y_predict, main='Maxent, raw prediction')
data(wrld_simpl, package="maptools")
plot(wrld_simpl, add=TRUE, border='dark grey')